import os
# Root folder containing the Uber datasets (Windows path; note the trailing backslash,
# later cells rely on it when building file paths by string concatenation).
path="C:\\Users\\tharu\\Downloads\\Uber\\"
os.listdir(path+'Datasets')
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Core analysis / plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 100,000-row sample of the Jan-June 2015 FHV pickup records.
uber=pd.read_csv(path+'Datasets'+'\\uber-raw-data-janjune-15_sample.csv')
uber.shape
(100000, 4)
uber.columns  # inspect the column names
Index(['Dispatching_base_num', 'Pickup_date', 'Affiliated_base_num',
'locationID'],
dtype='object')
uber.head(7)  # preview the first seven rows
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| 5 | B02617 | 2015-05-03 19:42:00 | B02617 | 87 |
| 6 | B02682 | 2015-01-14 20:21:50 | B02764 | 125 |
uber.dtypes  # Pickup_date is still a plain object/string at this point
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
# Parse the pickup timestamps so the .dt accessor can be used for feature extraction.
uber["Pickup_date"]=pd.to_datetime(uber["Pickup_date"])
uber.dtypes
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
uber.isnull().sum()  # per-column missing-value counts (only Affiliated_base_num has any)
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1118 locationID 0 dtype: int64
Here we can ignore the null values, since we are mainly focusing on analyzing the monthly highest pickups and the hourly rush.
uber.duplicated().sum()  # number of fully-duplicated rows
54
uber[uber.duplicated()==True].head(10)  # inspect a few of the duplicated rows
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 15345 | B02682 | 2015-06-23 19:11:00 | B02682 | 164 |
| 16424 | B02682 | 2015-03-22 00:27:00 | B02682 | 249 |
| 17934 | B02764 | 2015-04-19 01:53:00 | B02764 | 144 |
| 19410 | B02682 | 2015-06-28 11:49:00 | B02682 | 107 |
| 23936 | B02682 | 2015-05-10 13:19:00 | B02682 | 234 |
| 29417 | B02764 | 2015-03-31 19:13:00 | B02764 | 68 |
| 37104 | B02682 | 2015-04-30 22:01:00 | B02682 | 161 |
| 39065 | B02682 | 2015-04-20 17:51:00 | B02682 | 143 |
| 39152 | B02764 | 2015-05-28 22:43:00 | B02764 | 230 |
| 39305 | B02682 | 2015-06-14 23:40:00 | B02682 | 79 |
Similarly, we can keep these duplicate rows: although the rows match exactly, pickups are timestamped only to the minute, so they may still represent distinct trips.
uber.size  # total number of cells (rows x columns = 100000 x 4)
400000
# Derive calendar features from the pickup timestamp for the time-based analyses below.
uber["month"]=uber["Pickup_date"].dt.month_name()
uber["day"]=uber["Pickup_date"].dt.day_name()
uber["hour"]=uber["Pickup_date"].dt.hour
print(uber['month'].head())
print(uber["day"].head())
0 May 1 January 2 March 3 April 4 March Name: month, dtype: object 0 Saturday 1 Tuesday 2 Thursday 3 Friday 4 Monday Name: day, dtype: object
uber.dtypes  # confirm the new month/day/hour columns and their dtypes
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 month object day object hour int64 dtype: object
uber['month'].value_counts()  # pickups per month, descending
June 19636 May 18667 April 15995 March 15979 February 15903 January 13820 Name: month, dtype: int64
uber.groupby(by=["month"],as_index=False).size()  # same monthly counts via groupby
| month | size | |
|---|---|---|
| 0 | April | 15995 |
| 1 | February | 15903 |
| 2 | January | 13820 |
| 3 | June | 19636 |
| 4 | March | 15979 |
| 5 | May | 18667 |
# Monthly pickup counts, sorted busiest-first. The axis=0 keyword was dropped:
# it is the groupby default and the parameter is deprecated since pandas 2.1.
uber_month_high=uber.groupby(by=["month"],as_index=False).size().sort_values(by="size",ascending=False)
uber_month_high
| month | size | |
|---|---|---|
| 3 | June | 19636 |
| 5 | May | 18667 |
| 0 | April | 15995 |
| 4 | March | 15979 |
| 1 | February | 15903 |
| 2 | January | 13820 |
import seaborn as sns
# Bar chart of pickups per month (bars appear in the sorted, busiest-first order).
sns.barplot(x=uber_month_high["month"],y=uber_month_high["size"])
<Axes: xlabel='month', ylabel='size'>
# Month x weekday pickup counts, busiest-first, plotted as grouped bars. The axis=0
# keyword was dropped: it is the groupby default and is deprecated since pandas 2.1.
uber_month_day_high=uber.groupby(by=["month","day"],as_index=False).size().sort_values(by="size",ascending=False)
sns.barplot(data=uber_month_day_high,x="month",y="size",hue="day")
<Axes: xlabel='month', ylabel='size'>
# Weekday x hour pickup counts, busiest-first. The axis=0 keyword was dropped:
# it is the groupby default and the parameter is deprecated since pandas 2.1.
uber_hour_high=uber.groupby(by=["day","hour"],as_index=False).size().sort_values(by="size",ascending=False)
uber_hour_high
| day | hour | size | |
|---|---|---|---|
| 71 | Saturday | 23 | 1292 |
| 23 | Friday | 23 | 1211 |
| 19 | Friday | 19 | 1205 |
| 70 | Saturday | 22 | 1199 |
| 67 | Saturday | 19 | 1191 |
| ... | ... | ... | ... |
| 27 | Monday | 3 | 100 |
| 148 | Wednesday | 4 | 97 |
| 122 | Tuesday | 2 | 94 |
| 147 | Wednesday | 3 | 64 |
| 123 | Tuesday | 3 | 62 |
168 rows × 3 columns
# One line per weekday showing how pickup volume moves across the 24 hours.
sns.pointplot(data=uber_hour_high, x="hour", y="size", hue="day")
<Axes: xlabel='hour', ylabel='size'>
def period(x):
    """Map an hour of the day (0-23) to a named period.

    6-12 -> "Morning", 13-17 -> "Afternoon", 18-23 -> "Evening",
    everything else (0-5) -> "Night".
    """
    if 6 <= x <= 12:
        return "Morning"
    if 12 < x <= 17:
        return "Afternoon"
    if 17 < x <= 23:
        return "Evening"
    return "Night"
# Label each pickup with its period of day and check the distribution.
uber["period"]=uber["hour"].apply(period)
uber["period"].value_counts()
Evening 39302 Morning 24427 Afternoon 23679 Night 12592 Name: period, dtype: int64
# Month x period-of-day pickup counts shown as a heatmap. Renamed the local from
# "period" to "period_table" so it no longer shadows the period() helper function.
period_table=pd.crosstab(uber["month"],uber["period"])
plt.figure(figsize=(8,3))
sns.heatmap(period_table, cmap="crest",annot=True,fmt=".0f")
<Axes: xlabel='period', ylabel='month'>
# Jan-Feb 2015 FOIL summary: daily active vehicles and trips per dispatching base.
uber1=pd.read_csv(path+'Datasets'+'\\Uber-Jan-Feb-FOIL.csv')
uber1.head()
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
sns.boxplot(data=uber1,x="dispatching_base_number",y="active_vehicles")  # spread of daily active vehicles per base
<Axes: xlabel='dispatching_base_number', ylabel='active_vehicles'>
!pip install plotly
Requirement already satisfied: plotly in c:\users\tharu\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\tharu\anaconda3\lib\site-packages (from plotly) (8.2.2)
import plotly.express as px
# Interactive horizontal box plot with individual points, one box per dispatching base.
px.box(uber1,y="dispatching_base_number",x="active_vehicles",points="all",color="dispatching_base_number")
pip install folium
Requirement already satisfied: folium in c:\users\tharu\anaconda3\lib\site-packages (0.19.4) Requirement already satisfied: branca>=0.6.0 in c:\users\tharu\anaconda3\lib\site-packages (from folium) (0.8.1) Requirement already satisfied: jinja2>=2.9 in c:\users\tharu\anaconda3\lib\site-packages (from folium) (3.1.2) Requirement already satisfied: numpy in c:\users\tharu\anaconda3\lib\site-packages (from folium) (1.24.3) Requirement already satisfied: requests in c:\users\tharu\anaconda3\lib\site-packages (from folium) (2.31.0) Requirement already satisfied: xyzservices in c:\users\tharu\anaconda3\lib\site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\tharu\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.1) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (1.26.16) Requirement already satisfied: certifi>=2017.4.17 in c:\users\tharu\anaconda3\lib\site-packages (from requests->folium) (2024.2.2) Note: you may need to restart the kernel to use updated packages.
import folium
from folium.plugins import HeatMap
# Re-list the dataset directory to pick out the 2014 monthly raw-data files.
l=os.listdir(path+'Datasets')
l
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Keep only the 2014 monthly raw-data files. Filtering by filename is more robust
# than the original positional slice l[-8:] plus two remove() calls, which silently
# depended on os.listdir() returning the entries in sorted order.
l=[f for f in l if f.startswith('uber-raw-data') and 'janjune' not in f]
l
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Sanity-check one monthly file (April 2014) before concatenating them all.
uber1=pd.read_csv(path+'Datasets'+'\\uber-raw-data-apr14.csv')
uber1
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 4/1/2014 0:11:00 | 40.7690 | -73.9549 | B02512 |
| 1 | 4/1/2014 0:17:00 | 40.7267 | -74.0345 | B02512 |
| 2 | 4/1/2014 0:21:00 | 40.7316 | -73.9873 | B02512 |
| 3 | 4/1/2014 0:28:00 | 40.7588 | -73.9776 | B02512 |
| 4 | 4/1/2014 0:33:00 | 40.7594 | -73.9722 | B02512 |
| ... | ... | ... | ... | ... |
| 564511 | 4/30/2014 23:22:00 | 40.7640 | -73.9744 | B02764 |
| 564512 | 4/30/2014 23:26:00 | 40.7629 | -73.9672 | B02764 |
| 564513 | 4/30/2014 23:31:00 | 40.7443 | -73.9889 | B02764 |
| 564514 | 4/30/2014 23:32:00 | 40.6756 | -73.9405 | B02764 |
| 564515 | 4/30/2014 23:48:00 | 40.6880 | -73.9608 | B02764 |
564516 rows × 4 columns
# Load the six 2014 monthly files and stack them into one DataFrame.
# Reading everything first and concatenating once replaces the original pattern of
# seeding an empty float-dtype frame and calling pd.concat inside the loop, which
# is quadratic, reverses the file order, and leaves a duplicated index.
# ignore_index=True gives the combined frame a clean 0..n-1 index.
frames=[pd.read_csv(path+"Datasets"+'\\'+i) for i in l]
uber_final=pd.concat(frames,ignore_index=True)
uber_final.shape
(4534327, 4)
# Count pickups per exact (Lat, Lon) coordinate pair, busiest locations first.
uber_rush=uber_final.groupby(["Lat","Lon"],as_index=False).size().sort_values(by="size",ascending=False)
uber_rush
| Lat | Lon | size | |
|---|---|---|---|
| 32881 | 40.6448 | -73.7819 | 2299 |
| 432167 | 40.7685 | -73.8625 | 2257 |
| 32880 | 40.6448 | -73.7820 | 2079 |
| 33057 | 40.6449 | -73.7822 | 1947 |
| 453734 | 40.7741 | -73.8726 | 1921 |
| ... | ... | ... | ... |
| 226629 | 40.7232 | -73.7992 | 1 |
| 226627 | 40.7232 | -73.7996 | 1 |
| 226626 | 40.7232 | -73.7997 | 1 |
| 226625 | 40.7232 | -73.7999 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
# Base world map (no explicit center/zoom, so folium's defaults are used).
m=folium.Map()
m
# folium's HeatMap documents its input as a list of [lat, lng, weight] rows,
# so pass the coordinate/count values explicitly instead of the DataFrame object.
HeatMap(uber_rush[["Lat","Lon","size"]].values.tolist()).add_to(m)
m